﻿// +----------------------------------------------------------------------
// | Update: 2022-12-08 10:24
// +----------------------------------------------------------------------
// | Author: Kerindax <1482152356@qq.com>
// +----------------------------------------------------------------------
#include <algorithm>
#include <cmath>
#include <iterator>
#include <map>
#include <regex>
#include <string>
#include <vector>

using namespace std;

namespace std
{
    //补充函数
    static wstring trim(const wstring& s)
    {
        int ibegin = s.find_first_not_of(L" \t\r\n");
        if (ibegin == wstring::npos) return L"";
        int iend = s.find_last_not_of(L" \t\r\n");
        return s.substr(ibegin, iend - ibegin + 1);
    }
    static void replace(wstring& str, const wstring& src, const wstring& des)
    {
        int pos = -1;
        int curPos = 0;
        while (-1 != (pos = str.find(src, curPos)))
        {
            str.replace(pos, src.size(), des);
            curPos = pos + des.size();
        }
    }
    //补充函数,兼容c++11
    template<class BidirIt, class Traits, class CharT, class UnaryFunction>
    basic_string<CharT> regex_replace(BidirIt first, BidirIt last,
        const basic_regex<CharT, Traits>& re, UnaryFunction f)
    {
        basic_string<CharT> s;

        typename match_results<BidirIt>::difference_type
            positionOfLastMatch = 0;
        auto endOfLastMatch = first;

        auto callback = [&](const match_results<BidirIt>& match)
        {
            auto positionOfThisMatch = match.position(0);
            auto diff = positionOfThisMatch - positionOfLastMatch;

            auto startOfThisMatch = endOfLastMatch;
            advance(startOfThisMatch, diff);

            s.append(endOfLastMatch, startOfThisMatch);
            s.append(f(match));

            auto lengthOfMatch = match.length(0);

            positionOfLastMatch = positionOfThisMatch + lengthOfMatch;

            endOfLastMatch = startOfThisMatch;
            advance(endOfLastMatch, lengthOfMatch);
        };

        regex_iterator<BidirIt> begin(first, last, re), end;
        for_each(begin, end, callback);

        s.append(endOfLastMatch, last);

        return s;
    }
    template<class Traits, class CharT, class UnaryFunction>
    wstring regex_replace(const wstring& s,
        const basic_regex<CharT, Traits>& re, UnaryFunction f)
    {
        return regex_replace(s.cbegin(), s.cend(), re, f);
    }

}

class SpecialObject
{
public:
    wchar_t *basic;
    wchar_t *extend;
    wchar_t *link;
    SpecialObject(wchar_t*, wchar_t*, wchar_t*);
};
SpecialObject::SpecialObject(wchar_t*basic, wchar_t*extend, wchar_t*link) {
    this->basic = basic;
    this->extend = extend;
    this->link = link;
};
class UyghurCharUtils
{

public:
	UyghurCharUtils();
	wstring Basic2Extend(wstring source);
	wstring Extend2Basic(wstring source);
	wstring Basic2RExtend(wstring source);
	wstring RExtend2Basic(wstring source);
	wstring BasicSyllable(wstring source);

private:
    const int BASIC = 0; //基本区形式  A
    const int ALONE = 1; //单独形式    A
    const int HEAD = 2; //头部形式    A_
    const int CENTR = 3; //中部形式   _A_
    const int REAR = 4; //后部形式   _A
    // 双目字列表，转换扩展区的时候需要替换
    SpecialObject* special[2] = {
       new SpecialObject(new wchar_t[3] { 0x644, 0x627 }, new wchar_t[2] { 0xfefc },new wchar_t[3] { 0xfee0, 0xfe8e }),
       new SpecialObject(new wchar_t[3] { 0x644, 0x627 }, new wchar_t[2] { 0xfefb  },new wchar_t[3] { 0xfedf, 0xfe8e })
    };
    // 单字母列表
    map<wstring, vector<wstring>> charCode;
    wstring fromCharCode(wchar_t number) {
        return wstring(new wchar_t[2] {number});
    }

	wstring reverseAscii(wstring source);
	wstring reverseSubject(wstring source);
	wstring getChar(wstring ch, int index);
	wstring extendLa(wstring source);
	wstring basicLa(wstring source);
	wstring getString(wchar_t *value);
};

UyghurCharUtils::UyghurCharUtils()
{
    wchar_t arr[][5] = {
        {0x626, 0xfe8b, 0xfe8b, 0xfe8c, 0xfe8c}, // 1 --- 00-Hemze
        {0x627, 0xfe8d, 0xfe8d, 0xfe8e, 0xfe8e}, // 0 --- 01-a
        {0x6d5, 0xfee9, 0xfee9, 0xfeea, 0xfeea}, // 0 --- 02-:e
        {0x628, 0xfe8f, 0xfe91, 0xfe92, 0xfe90}, // 1 --- 03-b
        {0x67e, 0xfb56, 0xfb58, 0xfb59, 0xfb57}, // 1 --- 04-p
        {0x62a, 0xfe95, 0xfe97, 0xfe98, 0xfe96}, // 1 --- 05-t
        {0x62c, 0xfe9d, 0xfe9f, 0xfea0, 0xfe9e}, // 1 --- 06-j
        {0x686, 0xfb7a, 0xfb7c, 0xfb7d, 0xfb7b}, // 1 --- 07-q
        {0x62e, 0xfea5, 0xfea7, 0xfea8, 0xfea6}, // 1 --- 08-h
        {0x62f, 0xfea9, 0xfea9, 0xfeaa, 0xfeaa}, // 0 --- 09-d
        {0x631, 0xfead, 0xfead, 0xfeae, 0xfeae}, // 0 --- 10-r
        {0x632, 0xfeaf, 0xfeaf, 0xfeb0, 0xfeb0}, // 0 --- 11-z
        {0x698, 0xfb8a, 0xfb8a, 0xfb8b, 0xfb8b}, // 0 --- 12-:zh
        {0x633, 0xfeb1, 0xfeb3, 0xfeb4, 0xfeb2}, // 1 --- 13-s
        {0x634, 0xfeb5, 0xfeb7, 0xfeb8, 0xfeb6}, // 1 --- 14-x
        {0x63a, 0xfecd, 0xfecf, 0xfed0, 0xfece}, // 1 --- 15-:gh
        {0x641, 0xfed1, 0xfed3, 0xfed4, 0xfed2}, // 1 --- 16-f
        {0x642, 0xfed5, 0xfed7, 0xfed8, 0xfed6}, // 1 --- 17-:k
        {0x643, 0xfed9, 0xfedb, 0xfedc, 0xfeda}, // 1 --- 18-k
        {0x6af, 0xfb92, 0xfb94, 0xfb95, 0xfb93}, // 1 --- 19-g
        {0x6ad, 0xfbd3, 0xfbd5, 0xfbd6, 0xfbd4}, // 1 --- 20-:ng
        {0x644, 0xfedd, 0xfedf, 0xfee0, 0xfede}, // 1 --- 21-l
        {0x645, 0xfee1, 0xfee3, 0xfee4, 0xfee2}, // 1 --- 22-m
        {0x646, 0xfee5, 0xfee7, 0xfee8, 0xfee6}, // 1 --- 23-n
        {0x6be, 0xfbaa, 0xfbac, 0xfbad, 0xfbab}, // 1 --- 24-:h
        {0x648, 0xfeed, 0xfeed, 0xfeee, 0xfeee}, // 0 --- 25-o
        {0x6c7, 0xfbd7, 0xfbd7, 0xfbd8, 0xfbd8}, // 0 --- 26-u
        {0x6c6, 0xfbd9, 0xfbd9, 0xfbda, 0xfbda}, // 0 --- 27-:o
        {0x6c8, 0xfbdb, 0xfbdb, 0xfbdc, 0xfbdc}, // 0 --- 28-v
        {0x6cb, 0xfbde, 0xfbde, 0xfbdf, 0xfbdf}, // 0 --- 29-w
        {0x6d0, 0xfbe4, 0xfbe6, 0xfbe7, 0xfbe5}, // 1 --- 30-e
        {0x649, 0xfeef, 0xfbe8, 0xfbe9, 0xfef0}, // 1 --- 31-i
        {0x64a, 0xfef1, 0xfef3, 0xfef4, 0xfef2}, // 1 --- 32-y

        {0x6c5, 0xfbe0, 0xfbe0, 0xfbe1, 0xfbe1}, // 0 --- kz o_
        {0x6c9, 0xfbe2, 0xfbe2, 0xfbe3, 0xfbe3}, // 0 --- kz o^
        {0x62d, 0xfea1, 0xfea3, 0xfea4, 0xfea2}, // 1 --- kz h
        {0x639, 0xfec9, 0xfecb, 0xfecc, 0xfeca}, // 1 --- kz c
    };
    
    for (const auto & row : arr)
    {
        vector<wstring> list;
        for (auto el : row) {
            list.push_back(fromCharCode(el));
        }
        for (auto item : list) {
            auto it = charCode.find(item);
            if (it == charCode.end()) {
                charCode.insert(pair<wstring, vector<wstring>>(item, list));
            }
        }
    }
}
/// <summary>
/// 基本区   转换   扩展区
/// </summary>
/// <param name="source">要转换的内容，可以包含混合字符串</param>
/// <returns>已转换的内容</returns>
inline wstring UyghurCharUtils::Basic2Extend(wstring source)
{
    //转换范围；不包含哈语的0x0621字母,问号,双引号和Unicode区域的符号
    wstring convertRang = L"[\\u0622-\\u064a\\u0675-\\u06d5]+";
    //分割范围，有后尾的字符表达式
    wstring suffixRang = L"[^\\u0627\\u062F-\\u0632\\u0648\\u0688-\\u0699\\u06C0-\\u06CB\\u06D5]";

    return regex_replace(source, wregex(convertRang), [&](const wsmatch& word) {
        wstring returns = regex_replace(word.str(0), wregex(suffixRang), [](const wsmatch& m) {
            return m.str(0) + L"  ";
        });
        returns = regex_replace(trim(returns), wregex(L"(^|[^[:space:]])([^[:space:]])(?=$|[^[:space:]])"), [&](const wsmatch& m) {
            return m.str(1) + this->getChar(m.str(2), this->ALONE);
            });
        returns = regex_replace(returns, wregex(L"([^[:space:]]|^)([^[:space:]])[[:space:]]"), [&](const wsmatch& m) {
            return m.str(1) + this->getChar(m.str(2), this->HEAD);
            });
        returns = regex_replace(returns, wregex(L"[[:space:]]([^[:space:]])[[:space:]]"), [&](const wsmatch& m) {
            return this->getChar(m.str(1), this->CENTR);
            });
        returns = regex_replace(returns, wregex(L"[[:space:]]([^[:space:]])(?=[^[:space:]]|$)"), [&](const wsmatch& m) {
            return this->getChar(m.str(1), this->REAR);
            });
        return this->extendLa(returns);;
        });
}
/// <summary>
/// 扩展区   转换   基本区
/// </summary>
/// <param name="source">要转换的内容</param>
/// <returns>已转换的内容</returns>
inline wstring UyghurCharUtils::Extend2Basic(wstring source)
{
    //扩展区范围；FB50-FDFF ->区域A    FE70-FEFF -> 区域B
    wstring extendRang = L"[\\ufb50-\\ufdff\\ufe70-\\ufeff]";
    return regex_replace(this->basicLa(source), wregex(extendRang), [&](const wsmatch& m) {
        return this->getChar(m.str(0), this->BASIC);
        });
}
/// <summary>
///  基本区  转换   反向扩展区
/// </summary>
/// <param name="source">要转换的内容</param>
/// <returns>已转换的内容</returns>
inline wstring UyghurCharUtils::Basic2RExtend(wstring source)
{
    return this->reverseAscii(this->reverseSubject(this->Basic2Extend(source)));
}
/// <summary>
/// 反向扩展区   转换   基本区
/// </summary>
/// <param name="source">要转换的内容</param>
/// <returns>已转换的内容</returns>
inline wstring UyghurCharUtils::RExtend2Basic(wstring source)
{
    return this->Extend2Basic(this->reverseSubject(this->reverseAscii(source)));
}
/// <summary>
/// 音节索引
/// </summary>
inline wstring UyghurCharUtils::BasicSyllable(wstring source)
{
    // 音节切开专用，取韵母
    wstring finalsRang = L"([\\u0627\\u06d5\\u0648\\u06c7\\u06c6\\u06c8\\u06d0\\u0649\\u06c9\\u06c5])([^\\u0627\\u06d5\\u0648\\u06c7\\u06c6\\u06c8\\u06d0\\u0649\\u06c9\\u06c5]+)(?=[\\u0627\\u06d5\\u0648\\u06c7\\u06c6\\u06c8\\u06d0\\u0649\\u06c9\\u06c5])";
    return regex_replace(source, wregex(L"[^[:space:]]+"), [&](const wsmatch& m) {
            return regex_replace(m.str(0), wregex(finalsRang), [&](const wsmatch& m) {
                wstring ch2 = m.str(2);
                int index = floor(ch2.length() / 2);
                return m.str(1) + ch2.substr(0, index) + L" " + ch2.substr(index);
            });
        });
}
/// <summary>
/// Ascii区反转
/// </summary>
inline wstring UyghurCharUtils::reverseAscii(wstring source)
{
    // 特助转换区，扩展区反向转换的时候需要替换
    wstring symbolRang = L"[}{><»«\\)\\(\\]\\[]";
    map<wstring, wstring> symbolList =  {
        {L")", L"("},
        {L"(", L")"},
        {L"]", L"["},
        {L"[", L"]"},
        {L"}", L"{"},
        {L"{", L"}"},
        {L">", L"<"},
        {L"<", L">"},
        {L"»", L"«"},
        {L"«", L"»"}
    };
    // 不包含扩展区中部包含空格字符集；FB50-FDFF ->区域A FE70-FEFF -> 区域B
    wstring notExtendRang = L"[^\\ufb50-\\ufdff\\ufe70-\\ufeff[:space:]]+([[:space:]][^\\ufb50-\\ufdff\\ufe70-\\ufeff[:space:]]+)*";
    return regex_replace(source, wregex(notExtendRang), [&](const wsmatch& m) {
        wstring word = m.str(0);
        reverse(word.begin(), word.end());
        return regex_replace(word, wregex(symbolRang), [&](const wsmatch& m) {
            wstring ch = m.str(0);
            auto it = symbolList.find(ch);
            if (it != symbolList.end()) {
                return it->second;
            } else return ch;
        });
    });
}
/// <summary>
/// 对象反转
/// </summary>
inline wstring UyghurCharUtils::reverseSubject(wstring str)
{
    return regex_replace(str, wregex(L".+"), [&](const wsmatch& m) {
        wstring subject = m.str(0);
        reverse(subject.begin(), subject.end());
        return subject;
    });
}
/// <summary>
/// /// 获取对应字母
/// /// </summary>
inline wstring UyghurCharUtils::getChar(wstring ch, int index)
{
    map<const wstring, vector<wstring>>::iterator it;
    it = this->charCode.find(ch);
    if (it != this->charCode.end()) {
            return it->second[index];
    }
    else return ch;
}
/// <summary>
/// /// La字母转换扩展区
/// /// </summary>
inline wstring UyghurCharUtils::extendLa(wstring source)
{
    for (const auto& item: this->special) {
        replace(source, this->getString(item->link), this->getString(item->extend));
    }
	return source;
}
/// <summary>
/// La字母转换基本区
/// </summary>
inline wstring UyghurCharUtils::basicLa(wstring source)
{
    for (const auto& item : this->special) {
        replace(source, this->getString(item->extend), this->getString(item->basic));
    }
    return source;
}
/// <summary>
/// 双目字母转换字符串
/// </summary>
inline wstring UyghurCharUtils::getString(wchar_t *value)
{
	return value;
}

